library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)
library(ggrepel)
source("../vcode_to_country.R")

# Load the smoothed demography data. The code below relies on this file
# providing a data frame named `dat`.
load("data/demography_data_smoothed.Rdata")

# Clean source list: replace the "manually set" placeholder with a pointer to
# the text, then pass the data through country_from_vcode() (presumably this
# attaches the country names used further down -- confirm in
# ../vcode_to_country.R).
dat <- dat %>%
  mutate(
    source = ifelse(
      source == "manually set",
      "See Section II in text",
      source
    )
  ) %>%
  country_from_vcode()

# Wikipedia pages: sources written as bracketed reference numbers (e.g. "[12]"
# or "[3][7]") are expanded into the links listed in wiki_links.xlsx, which is
# expected to have `ref_no` and `link` columns.
wiki_links <- rio::import("wiki_links.xlsx")

# Split with a logical mask rather than dat[-grep(...), ]: when nothing
# matches, grep() returns integer(0) and negative indexing with it would drop
# every row of dat instead of none.
is_wiki <- grepl("\\[\\d+\\]", dat$source)
wiki_rows <- dat[is_wiki, ]
dat <- dat[!is_wiki, ]

# Strip the outermost brackets and turn "][" separators into commas, leaving
# one comma-separated list of reference numbers per row.
sids <- sub("\\[(.*)\\]", "\\1", wiki_rows$source)
sids <- gsub("\\]\\[", ",", sids)
sids <- stringr::str_split(sids, ",")

# Build one label per row. vapply() with character(1) keeps the result a
# character vector even when there are no Wikipedia rows, so the column type
# still matches dat$source in bind_rows(). Reference numbers missing from
# wiki_links show up as "NA" in the label.
wiki_sources <- vapply(
  sids,
  function(ref_nos) {
    links <- wiki_links$link[match(ref_nos, wiki_links$ref_no)]
    paste0("Wikipedia (Original source: ", paste(links, collapse = ", "), ")")
  },
  character(1)
)
wiki_rows$source <- wiki_sources
dat <- bind_rows(dat, wiki_rows)

# Other cleaning: normalise source labels with an ordered table of
# pattern -> replacement rules. Each rule rewrites only the first match in a
# string (sub(), not gsub()), and the order matters because a later rule can
# match text produced by an earlier one (e.g. "Worldfactbook" is expanded
# before "The World Factbook" is rewritten).
# NOTE(review): the unanchored patterns ("alesina", "wikipedia") would also hit
# the first occurrence inside a longer string such as a URL -- confirm that no
# source contains them there.
source_rules <- list(
  c("alesina", "Alesina et al. (2003)"),
  c("CIA$", "CIA World Factbook"),
  c("Worldfactbook", "World Factbook"),
  c("The World Factbook", "CIA World Factbook"),
  c("US census", "US Census"),
  c("wikipedia", "Wikipedia"),
  c(", census 2016", ""),
  c("World Statesmen$", "https://www.worldstatesmen.org"),
  c("Putterman and Weil$", "Putterman and Weil (2010)"),
  c(
    "Robinson$",
    "Robinson, Glenn (Personal communication with authors dated October 9, 2018)"
  )
)
for (rule in source_rules) {
  dat$source <- sub(rule[1], rule[2], dat$source)
}

# Give every distinct source label a numeric id (1, 2, ...) in the order
# produced by arrange(). dat$source is a character vector at this point (it
# was last assigned from sub()), so sort(unique()) yields the same values as
# levels(as.factor()).
sources <- data.frame(
  source = sort(unique(dat$source)),
  stringsAsFactors = FALSE
)
# row_number() numbers the rows of the table being piped, so it neither reads
# the global `sources` from inside its own pipeline nor produces c(1, 0) for
# an empty table the way 1:nrow(sources) would.
sources <- sources %>%
  arrange(source) %>%
  mutate(source_id = row_number())

# Attach the id of each row's source to the main data.
dat$source_id <- sources$source_id[match(dat$source, sources$source)]

# let's visualize the results

# Rescale both estimate columns by 100 (the plot below draws them on a 0-100
# axis), then drop the Cook Islands (435) and Niue (443).
dat %<>%
  mutate(eur_pct_est_smooth = 100 * eur_pct_est_smooth,
         eur_pct_est = 100 * eur_pct_est) %>%
  filter(!country_id %in% c(435, 443))

# Lookup table of the countries left to plot, ordered by country name.
# An earlier copy of this table was built before the filter above and then
# overwritten without ever being read; only the post-filter version is kept.
ids <- data.frame(
  country_id = as.numeric(as.character(levels(as.factor(dat$country_id))))
)
ids <- country_from_vcode(ids)
ids %<>%
  arrange(vdem_country_name)
# Number of countries (facets) per output page, and the number of pages needed
# to cover every country in `ids`.
n_ppp <- 6
n_groups <- ceiling(nrow(ids) / n_ppp)

# ggsave() does not create a missing output directory by default, so make sure
# it exists before the loop starts writing files.
dir.create("output", showWarnings = FALSE, recursive = TRUE)

# seq_len() makes the loop a no-op when there are no countries; 1:n_groups
# would iterate over c(1, 0) instead.
for (i in seq_len(n_groups)) {
  f_out <- paste0("output/appendix_e_group_", i, ".png")

  # Rows of `ids` shown on page i; the last page may hold fewer than n_ppp.
  mi <- 1 + n_ppp * (i - 1)
  ma <- min(n_ppp * i, nrow(ids))
  g_id <- ids[mi:ma, , drop = FALSE]
  foo <- dat[dat$country_id %in% g_id$country_id, ]

  # Smoothed series as a line, raw estimates as points labelled with the id of
  # their source; one facet per country.
  p <- ggplot(foo, aes(x = year)) +
    geom_line(aes(y = eur_pct_est_smooth)) +
    geom_point(aes(y = eur_pct_est)) +
    geom_text_repel(aes(y = eur_pct_est, label = source_id),
                    angle = 45,
                    segment.alpha = 0.25) +
    ylim(0, 100) +
    labs(x = "Year", y = "European Ancestry") +
    theme(panel.background = element_blank(),
          axis.line = element_line(color = "black"),
          legend.position = "none") +
    facet_wrap(~ vdem_country_name, nrow = 3)

  # Name the arguments in full instead of relying on `file` partially
  # matching `filename`.
  ggsave(filename = f_out, plot = p, dpi = 300,
         width = 8.5, height = 11, units = "in")
}

